R course for faculty

Presentation 5: Graphs

Jonas Moss

Graphics in R

There are three main options for doing graphics in R:

  • The “base” graphics in the graphics library (preinstalled).
  • The ggplot2 library, which is more flexible but harder to use.
  • The lattice library, which I am unfamiliar with.

We focus on base graphics.

Scatterplots

Goal

Getting started

plot(x = mtcars$mpg, 
     y = mtcars$disp)

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement")

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles")

Modify colors

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "red")

Change the color of the elements to cyan.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "cyan")

Modify the box

Remove the box around the points completely. (The “axes” with numbers will stay there.) Use ?par and look for the bty parameter.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "red",
     bty = "n")

Removing the axes

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     axes = FALSE,
     bty = "n")

Remove all text from the plot, leaving only the points.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "", 
     ylab = "", 
     main = "",
     col = "red",
     axes = FALSE,
     bty = "n")

Modify the point style

Make a scatterplot, with the same data, using triangles. To do this, find the pch code for a triangle (google, e.g., pch r there are multiple!) and make a plot.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     pch = 17,
     cex = 1.6)

Color the important cars

Use rownames(mtcars) to color your three favorite cars in blue.

My favorite cars are Hornet 4 Drive and Hornet Sportabout. They have indices 4 and 5.

colors <- rep("black", length(mtcars$mpg))
colors[c(4, 5)] <- "blue"
plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = colors,
     pch = 20,
     cex = 1.6)

Point size of important cars

Change the point style of the red elements to a filled diamond.

colors <- rep("black", length(mtcars$mpg))
colors[c(14, 28)] <- "red"
cexes <- rep(1.6, length(mtcars$mpg))
cexes[c(14, 28)] <- 4
pchs <- rep(20, length(mtcars$mpg))
pchs[c(14,28)] <- 18
plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = colors,
     pch = pchs,
     cex = cexes)

Plotting in the right order

colors <- rep("black", length(mtcars$mpg))
colors[c(14, 28)] <- "red"
cexes <- rep(1.6, length(mtcars$mpg))
cexes[c(14, 28)] <- 4
pchs <- rep(20, length(mtcars$mpg))
pchs[c(14,28)] <- 18
plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = colors,
     pch = pchs,
     cex = cexes)
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

colors <- rep("black", length(mtcars$mpg))
colors[c(14, 28)] <- "red"
cexes <- rep(1.6, length(mtcars$mpg))
cexes[c(14, 28)] <- 4
pchs <- rep(20, length(mtcars$mpg))
pchs[c(14,28)] <- 18
plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = colors,
     pch = pchs,
     cex = cexes)
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

Using xlim and ylim

::: {.panel-tabset}

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information",
     col = "red",
     pch = 18,
     cex = 4)
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information",
     col = "red",
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)))
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

Adding transparency

Transparency can be added using the rbg function. The final argument is the level of transparency.

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)))
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

Make the red points even more transparent. Add the black points to the plot.

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.9),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)))
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)

Gridlines

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l")
grid()

Exercise

The grid line at approximately (15, 300) doesn’t look good, since it’s plotted above the points. Use the point function to fix the problem.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l")
grid()
points(x = mtcars$mpg, 
     y = mtcars$disp, 
     col = "black",
     cex = 1.6,
     pch = 20)

Adding lines with abline

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l",
     xlim = c(0, 40))
grid()
points(x = mtcars$mpg, 
     y = mtcars$disp, 
     col = "black",
     cex = 1.6,
     pch = 20)

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l",
     xlim = c(0, 40))
grid()
points(x = mtcars$mpg, 
     y = mtcars$disp, 
     col = "black",
     cex = 1.6,
     pch = 20)
abline(v = 10, lty = 1, lwd = 2, col = "red")

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l",
     ylim = c(0, 500),
     xlim = c(0, 40))
grid()
points(x = mtcars$mpg, 
     y = mtcars$disp, 
     col = "black",
     cex = 1.6,
     pch = 20)
abline(v = 10, lty = 1, lwd = 2, col = "red")
abline(h = 50, lty = 1, lwd = 2, col = "blue")

Exercise

Inspect the documentation of abline and add a purple line with slope 1000 and intercept -50.

plot(x = mtcars$mpg, 
     y = mtcars$disp, 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = "black",
     cex = 1.6,
     pch = 20,
     bty = "l",
     ylim = c(0, 500),
     xlim = c(0, 40))
grid()
points(x = mtcars$mpg, 
     y = mtcars$disp, 
     col = "black",
     cex = 1.6,
     pch = 20)
abline(v = 10, lty = 1, lwd = 2, col = "red")
abline(h = 50, lty = 1, lwd = 2, col = "blue")
abline(a = 1000, b = -50, lwd = 2, col = "purple")

Adding labels

The text function adds labels to a plot.

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)))
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)])

The labels overlapp the points. We must fix this manually.

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))

Adding a scatterplot smoother

The loess function estimates a scatterplot smoother.

model <- loess(disp ~ mpg, data = mtcars)
plot(mtcars$mpg, predict(model))

Let’s use a line plot.

plot(mtcars$mpg, predict(model), type = "l")

We must sort the miles per gallons to get a good plot.

x <- sort(mtcars$mpg)
plot(x, predict(model, x), type = "l")

x <- sort(mtcars$mpg)
plot(x, predict(model, x), type = "l", lwd = 2)

x <- sort(mtcars$mpg)
plot(x, predict(model, x), type = "l", lwd = 2, lty = 3, col = "grey")

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 2, lty = 3, col = "grey")

Exercise

The scatterplot smoother is plotted above the data points, crearting an ugly effec (see mpg ~ 30). Modify the code so the points are plotted on top of the smoother.

model <- loess(disp ~ mpg, data = mtcars)
plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 2, lty = 3, col = "grey")
legend("topright", legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"), col = c("grey", "black", "red"))

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 2, lty = 3, col = "grey")
legend("topright", 
       legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"),
       col = c("grey", "black", "red"), 
       lwd = c(2, NA, NA),
       pch = c(NA, 20, 16),
       lty = c(3, NA, NA))

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     sub = "One gallon equals ~3.8 litres",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 2, lty = 3, col = "grey")
legend("topright", 
       legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"),
       col = c("grey", "black", rgb(1, 0, 0, 0.6)), 
       lwd = c(2, NA, NA),
       pch = c(NA, 20, 18),
       lty = c(3, NA, NA),
       pt.cex = c(0, 1, 4/1.6),
       bty = "n")

Exercise

Modify the code so that the “Scatterplot smoother” appears at the bottom of the legend, not the top.

Saving the graph

pdf(file = "cars.pdf", width = 8, height = 8) 

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     main = "Important information concerning automobiles",
     sub = "One gallon equals ~3.8 litres",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 2, lty = 3, col = "grey")
legend("topright", 
       legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"),
       col = c("grey", "black", rgb(1, 0, 0, 0.6)), 
       lwd = c(2, NA, NA),
       pch = c(NA, 20, 18),
       lty = c(3, NA, NA),
       pt.cex = c(0, 1, 4/1.6),
       bty = "n")

dev.off()
png 
  2 
pdf(file = "cars.pdf", width = 8, height = 8) 

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     sub = "One gallon equals ~3.8 litres",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     cex.lab = 1.5,
     cex.main = 2,
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 3, lty = 3, col = "grey")
legend("topright", 
       legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"),
       col = c("grey", "black", rgb(1, 0, 0, 0.6)), 
       lwd = c(3, NA, NA),
       pch = c(NA, 20, 18),
       lty = c(3, NA, NA),
       pt.cex = c(0, 1, 4/1.6),
       cex = c(1.3,1.3,1.3),
       bty = "n")

dev.off()
png 
  2 

Some journals do not accept pdf graphs.

  • BMP, JPEG, PNG and TIFF. Avoid these if possible, as they are lossy.
  • eps is a PostScript format that is often accepted.
  • svg is probably the preferred format.
svg("cars.svg", width = 8, height = 8) 

plot(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)], 
     xlab = "Miles per gallon", 
     ylab = "Displacement", 
     sub = "One gallon equals ~3.8 litres",
     col = rgb(1, 0, 0, 0.6),
     pch = 18,
     cex = 4,
     xlim = c(min(mtcars$mpg), max(mtcars$mpg)),
     ylim = c(min(mtcars$disp), max(mtcars$disp)),
     cex.lab = 1.5,
     cex.main = 2,
     bty = "l")
points(x = mtcars$mpg[-c(14, 28)], 
       y = mtcars$disp[-c(14, 28)],
       pch = 20,
       col = "black",
       cex = 1.6)
text(x = mtcars$mpg[c(14, 28)], 
     y = mtcars$disp[c(14, 28)],
     labels = rownames(mtcars)[c(14, 28)],
     pos = c(1, 3),
     offset = c(1,1))
x <- sort(mtcars$mpg)
lines(x, predict(model, x), lwd = 3, lty = 3, col = "grey")
legend("topright", 
       legend = c("Scatterplot smoother", "Unimportant cars", "Important cars"),
       col = c("grey", "black", rgb(1, 0, 0, 0.6)), 
       lwd = c(3, NA, NA),
       pch = c(NA, 20, 18),
       lty = c(3, NA, NA),
       pt.cex = c(0, 1, 4/1.6),
       cex = c(1.3,1.3,1.3),
       bty = "n")

dev.off()
png 
  2 

Exercise

In Norway we often use liter per 10 kilometers (Norwegian mile). Make a variable lpt that contains the liter per 10 kilometers. Use the conversion factor gallon = 3.78531.

Plotting parameters

Documentation

  • xlab
  • ylab
  • main

Multiple plots

  • lines
  • points
  • add

Styling

  • color
  • type
  • lty
  • pch

Statistical plots

  • boxplot
  • barplot
  • hist

Box plots

boxplot(mtcars$hp, mtcars$disp)

boxplot(mtcars$hp, mtcars$disp, names = c("Horsepower", "Displacement"))

Exercise

Change the color of hp to green and displacement to yellow.

boxplot(mtcars$hp, mtcars$disp, names = c("Horsepower", "Displacement"), col = c("green", "yellow"))

Exercise

Add an informative title to the plot. Change the colors back to something pleasant.

boxplot(mtcars$hp, mtcars$disp, names = c("Horsepower", "Displacement"), col = c("white", "white"), main = "Distribution of car properties")

Exercise

Make a boxplot of wt, drat, and cyl. Add labels and title.

boxplot(mtcars$wt, mtcars$drat, mtcars$cyl, names = c("Weight", "Rear axle ratio", "Cylinders"), col = "white", main = "Distribution of car properties")

Bar plots

barplot(cylinders)
Error: object 'cylinders' not found

Histograms

hist(mtcars$mpg)

hist(mtcars$mpg, xlab = "Miles per gallon", main = "Histogram of miles per gallon")

Exercise

Change the color of the histogram to white. Make the histogram a density histogram, i.e., the integral should integrate to 1. (By default, it shows counts.)

hist(mtcars$mpg, xlab = "Miles per gallon", main = "Histogram of miles per gallon", color = "white", freq = FALSE)

Exercise

The default value for breaks (the number of bins + 1) is too small by default, as R uses an outdated method for calculating the breaks. Experiment with different breaks and choose a “pleasing” one.

hist(mtcars$mpg, xlab = "Miles per gallon", main = "Histogram of miles per gallon", color = "white", freq = FALSE, breaks = 10)

Exercise

Add a call to rug on mtcars$mpg below the histogram.

hist(mtcars$mpg, xlab = "Miles per gallon", main = "Histogram of miles per gallon", col = "white", freq = FALSE, breaks = 10)
rug(mtcars$mpg)

We are sometimes interested in evaluating wether a dataset is approximately normal. We can do this visually (but unreliably!) by plotting a normal density curve on top of a histogram.

mu <- mean(mtcars$mpg)
sigma <- sd(mtcars$mpg)
x <- seq(min(mtcars$mpg), max(mtcars$mpg), length.out = 100)
x
  [1] 10.40000 10.63737 10.87475 11.11212 11.34949 11.58687 11.82424 12.06162
  [9] 12.29899 12.53636 12.77374 13.01111 13.24848 13.48586 13.72323 13.96061
 [17] 14.19798 14.43535 14.67273 14.91010 15.14747 15.38485 15.62222 15.85960
 [25] 16.09697 16.33434 16.57172 16.80909 17.04646 17.28384 17.52121 17.75859
 [33] 17.99596 18.23333 18.47071 18.70808 18.94545 19.18283 19.42020 19.65758
 [41] 19.89495 20.13232 20.36970 20.60707 20.84444 21.08182 21.31919 21.55657
 [49] 21.79394 22.03131 22.26869 22.50606 22.74343 22.98081 23.21818 23.45556
 [57] 23.69293 23.93030 24.16768 24.40505 24.64242 24.87980 25.11717 25.35455
 [65] 25.59192 25.82929 26.06667 26.30404 26.54141 26.77879 27.01616 27.25354
 [73] 27.49091 27.72828 27.96566 28.20303 28.44040 28.67778 28.91515 29.15253
 [81] 29.38990 29.62727 29.86465 30.10202 30.33939 30.57677 30.81414 31.05152
 [89] 31.28889 31.52626 31.76364 32.00101 32.23838 32.47576 32.71313 32.95051
 [97] 33.18788 33.42525 33.66263 33.90000
plot(x, dnorm(x, mu, sigma), type = "l")

Exercise

Using lines, add the normal density plot to the histogram. Be sure to choose a nice color and line type (lty)!

hist(mtcars$mpg, xlab = "Miles per gallon", main = "Histogram of miles per gallon", col = "white", freq = FALSE, breaks = 10)
rug(mtcars$mpg)
lines(x, dnorm(x, mu, sigma), lty = 3, col = "black", lwd = 2)

Exercise

The Shapiro–Wilk test is among the most widely used tests of normality. Find the Shapiro–Wilk test in R and evaluate the normality of mtcars$mpg formally.

shapiro.test(mtcars$mpg)

    Shapiro-Wilk normality test

data:  mtcars$mpg
W = 0.94756, p-value = 0.1229

We can’t formally reject normality. But it’s not very plausible in the first place.

Quantile-quantile plots

qqnorm(mtcars$mpg)

qqnorm(mtcars$mpg)
qqline(mtcars$mpg)

Exercise

  1. The default value of main is a string now. Remove the title from the plot. (ii) Change the “box” around the plot so it has lines emanating only from the lower-left corner.
qqnorm(mtcars$mpg, main = NULL, bty = "l")
qqline(mtcars$mpg)

Plots side by side

par(mfrow = c(2,2))
hist(mtcars$mpg, xlab = "Miles per gallon", col = "white", main = NULL, freq = FALSE, breaks = 10)
rug(mtcars$mpg)
lines(x, dnorm(x, mu, sigma), lty = 3, col = "black", lwd = 2)
qqnorm(mtcars$mpg, main = NULL, bty = "l")
qqline(mtcars$mpg)
boxplot(mtcars$mpg, xlab = "Miles per gallon")
plot(mtcars$mpg, mtcars$disp, xlab = "Miles per gallon", ylab = "Displacement", pch = 20)

plot is generic

The plot function is generic. It can be applied to many different objects, with sometimes unpredictable behaviour!

properties <- datarium::properties
head(properties)
  property_type  buyer_type
1          flat single male
2          flat single male
3          flat single male
4          flat single male
5          flat single male
6          flat single male
property_table <- table(properties$property_type, properties$buyer_type)
property_table
                
                 single male single female married couple family
  flat                    40            30             16     10
  bungalow                 4             4             14     16
  detached house           8            16             26     42
  terrace                 16             7             45     39
plot(property_table)

To find the documentation of a generic, you must know the class of the object you use.

class(property_table)
[1] "table"

Instances of generic functions can be found by placing a . behind the generic method:

?plot.table

Exercise

Call plot(dnorm, -5, 5). What does the plot function do now? Find its documentation.